#############
#############
#############
## Stefan Müller and Sven-Oliver Proksch:
## Nostalgia in European Party Politics:
## A Text-Based Measurement Approach
## British Journal of Political Science
##
## Python script to apply nostalgia DistilBERT classifier in batches
#############
#############
#############

## If you would like to apply the fine-tuned
## classifier to your own data, check the following file:
## 07_tutorial_classify_nostalgia_distilbert.ipynb

# https://stackoverflow.com/a/67340183
# Clean environment: drop names left over from an interactive session.
# Only names that do NOT start with an underscore are removed. A plain
# globals().clear() would also delete __name__, __builtins__, __file__ and
# __spec__, which Python itself and imported libraries expect to find in the
# module namespace.
for _stale_name in [name for name in globals() if not name.startswith("_")]:
    del globals()[_stale_name]
# remove the loop variable again (it only exists if something was deleted)
globals().pop("_stale_name", None)

# Make sure the folder distilbert_nostalgic is in the same folder as this file
# To preserve the folder structure, download all files from the Dataverse at once.
# Alternatively, move config.json, pytorch_model.bin, and training_args.bin into a folder named distilbert_nostalgic.

## install packages

# !pip install datasets
# !pip install transformers
# !pip install sklearn
# !pip3 install torch torchvision
# !pip install psutil

## load packages and functions
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer
import numpy as np

import torch
import pandas as pd
import os

## set the TOKENIZERS_PARALLELISM environment variable to "true"
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

## load the tokenizer of the 'distilbert-base-uncased' checkpoint;
## it is used by tokenize_data() below
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

## preprocessing functions
def transform_labels(label):
    """Return the value of the label column under the key ``'labels'``.

    ``label`` is indexed with the module-level ``label_train`` (the name of
    the label column), and that value is wrapped in a one-entry dictionary.
    ``label_train`` is looked up when the function is called, so it must be
    defined before the first call.
    """
    return {'labels': label[label_train]}


## tokenization function
## (padding and truncation ensure that very short/long texts are handled)
def tokenize_data(example):
    """Run the module-level ``tokenizer`` on the ``'text'`` field of ``example``.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = example['text']
    return tokenizer(texts, truncation=True, padding='max_length')

## load the DistilBERT nostalgia model from the local folder
## "distilbert_nostalgic" (sequence classification with two labels)
model_nostalgia = AutoModelForSequenceClassification.from_pretrained(
    "distilbert_nostalgic",
    num_labels=2,
)



## 1. Pre-processing

## determine category for training and prediction
## (name of the label column; also read by transform_labels())
label_train = "nostalgic"

# load training dataset
dataset_dict = load_dataset("csv", data_files="data_coded_train.csv")
dataset = dataset_dict["train"]

# should have 960 rows
print(dataset)

## tokenize text
dataset = dataset.map(tokenize_data, batched=True)

# Remove unnecessary columns and ensure labels are in the correct format
remove_columns = ["text", "countryname", "translation_inaccurate", "nostalgic"]

dataset = dataset.map(transform_labels, remove_columns=remove_columns)

## split data into training and evaluation sets
## shuffle=False keeps the original row order: of the 960 observations,
## the first 768 form the training set and the remaining 192 the
## evaluation set
dataset = dataset.train_test_split(test_size=0.2, shuffle=False)

## The row counts are printed explicitly: a bare expression such as
## `train_dataset` only shows output in an interactive session and does
## nothing when this file is run as a script.
train_dataset = dataset['train']
# should have 768 observations
print(train_dataset)

eval_dataset = dataset['test']
# should have 192 observations
print(eval_dataset)

## F1 score used by compute_metrics() below
## https://huggingface.co/docs/transformers/training
metric = load_metric("f1")


def compute_metrics(eval_pred):
    """Compute the module-level ``metric`` for one set of predictions.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    of each row is the index of the largest logit along the last axis; these
    predictions are compared against ``labels``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)


## trainer settings: output directory, evaluation strategy, and batch size
## per device during evaluation
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_eval_batch_size=128,
    evaluation_strategy="epoch",
)

## https://en.wikipedia.org/wiki/Training,_validation,_and_test_data_sets

## 2. Build the trainer for the downstream task
## NOTE: trainer.train() is not called anywhere in the code visible here;
## the trainer object is used further down to run predictions.

## build trainer
trainer = Trainer(
    args=training_args,
    model=model_nostalgia,
    compute_metrics=compute_metrics,  # F1 score (see compute_metrics)
    train_dataset=train_dataset,  # training set
    eval_dataset=eval_dataset,  # evaluation set
)


## fill in path to unlabelled data


### BATCHES 1-6

## The unlabelled data come in six CSV files, dat_full_bert1.csv to
## dat_full_bert6.csv. Every batch goes through exactly the same steps, so
## one helper is called in a loop instead of repeating the code six times.
## The per-batch objects are local to the helper and are therefore not kept
## alive as module-level variables once a batch is finished.

# number of batch files
N_BATCHES = 6


def classify_batch(batch_number):
    """Classify one batch of unlabelled documents and save the result as CSV.

    Reads ``dat_full_bert<batch_number>.csv``, tokenizes it with
    ``tokenize_data``, predicts a label for every row with the module-level
    ``trainer``, and writes the input data plus a ``<label_train>_bert``
    column to ``data_classified_bert_<batch_number>.csv``.

    Parameters
    ----------
    batch_number : int
        Number of the batch; it is inserted into the input and output file
        names.

    Returns
    -------
    None
    """
    unlab_data_path = f"dat_full_bert{batch_number}.csv"

    unlab_dataset = load_dataset("csv", data_files=unlab_data_path)

    # batches 1-5 should have 200,000 documents each;
    # batch 6 is the last part of the data frame and should have
    # 192,675 documents
    print(unlab_dataset)

    ## preprocess unlabelled data in same way as training data
    unlab_dataset = unlab_dataset.map(tokenize_data, batched=True)

    ## use model to predict a score for each label and keep, for every
    ## document, the label with the highest score
    preds = trainer.predict(unlab_dataset["train"])
    pred_labels = np.argmax(preds[0], axis=1)

    # merge labels with dataset and save as CSV
    output = pd.read_csv(unlab_data_path)
    output[label_train + "_bert"] = pred_labels
    output.to_csv(f"data_classified_bert_{batch_number}.csv")


for batch_number in range(1, N_BATCHES + 1):
    classify_batch(batch_number)
